Preparing the environment

Load the required packages:

library(tidyverse)
library(glue)
library(knitr)
library(Matrix)
library(scRNAseq)
library(scater)
library(scran)
library(dittoSeq)
library(reticulate)
library(zellkonverter)

Create a conda environment containing the required Python packages:

if (!file.exists(miniconda_path())) {
    install_miniconda()
}
conda_install("r-cellrank", c("cellrank-krylov", "python-igraph"),
              python_version = "3.8", channel = c("conda-forge", "bioconda"))

Miniconda installation path:

miniconda_path()
## [1] "/home/rstudio/.local/share/r-miniconda"

List the available conda environments:

conda_list()
name python
0 /home/rstudio/.cache/R/basilisk/1.4.0/0/bin/python
zellkonverterAnnDataEnv /home/rstudio/.cache/R/basilisk/1.4.0/zellkonverter/1.2.1/zellkonverterAnnDataEnv/bin/python
r-miniconda /home/rstudio/.local/share/r-miniconda/bin/python
r-cellrank /home/rstudio/.local/share/r-miniconda/envs/r-cellrank/bin/python
r-reticulate /home/rstudio/.local/share/r-miniconda/envs/r-reticulate/bin/python

Use the created conda environment:

use_condaenv("r-cellrank", required = TRUE)

Preparing the data

# Load the Hermann et al. (2018) mouse spermatogenic cells dataset
sce <- HermannSpermatogenesisData()
sce
## class: SingleCellExperiment 
## dim: 54448 2325 
## metadata(0):
## assays(2): spliced unspliced
## rownames(54448): ENSMUSG00000102693.1 ENSMUSG00000064842.1 ...
##   ENSMUSG00000064369.1 ENSMUSG00000064372.1
## rowData names(0):
## colnames(2325): CCCATACTCCGAAGAG AATCCAGTCATCTGCC ... ATCCACCCACCACCAG
##   ATTGGTGGTTACCGAT
## colData names(1): celltype
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
# Fix row names
rownames(sce) <- str_remove(rownames(sce), ".\\d+$")
# Cell metadata
colData(sce)
## DataFrame with 2325 rows and 1 column
##                                celltype
##                             <character>
## CCCATACTCCGAAGAG DIplotene/Secondary ..
## AATCCAGTCATCTGCC DIplotene/Secondary ..
## GACTGCGGTTTGACTG DIplotene/Secondary ..
## ACACCAATCTTCGGTC DIplotene/Secondary ..
## TGACAACAGGACAGAA   Mid Round spermatids
## ...                                 ...
## CCCTCCTCATCGGACC                     NA
## GAAACTCCACTATCTT                     NA
## AACGTTGTCATCGATG                     NA
## ATCCACCCACCACCAG                     NA
## ATTGGTGGTTACCGAT                     NA
# Cell type summary
sce$celltype %>%
  fct_explicit_na(na_level = "NA") %>%
    table() %>%
    enframe() %>%
    arrange(desc(value))
name value
Mid Round spermatids 878
NA 447
Late Round spermatids 423
Early Round spermatids 410
DIplotene/Secondary spermatocytes 136
Pachytene spermatocytes 10
A3-A4-In-B Differentiating spermatogonia 7
Leptotene/Zygotene spermatocytes 7
Preleptotene spermatocytes 3
Sertoli cells 2
A1-A2 Differentiating spermatogonia 1
Undifferentiated spermatogonia 1
# Prepare the cell type labels
sce$cell_type <- sce$celltype %>%
  fct_explicit_na(na_level = "Other") %>%
  str_remove(" spermatids") %>%
    fct_lump_min(min = 400, other_level = "Other") %>%
    fct_relevel("Early Round", "Mid Round", "Late Round", "Other") %>%
    fct_drop()
table(sce$cell_type)
## 
## Early Round   Mid Round  Late Round       Other 
##         410         878         423         614

Basic analysis

# Set the RNG seed for reproducible results
set.seed(42)
# Data normalization
quick_clusters <- quickCluster(sce, assay.type = "spliced")
table(quick_clusters)
## quick_clusters
##   1   2   3   4   5   6   7   8   9  10  11  12  13 
## 133 139 244 254 176 117 196 177 157 201 213 197 121
sce <- computeSumFactors(sce, clusters = quick_clusters, assay.type = "spliced")
sce <- logNormCounts(sce, assay.type = "spliced")
# Selecting highly variable genes
dec <- modelGeneVar(sce)
top_hvgs <- getTopHVGs(dec, fdr.threshold = 0.05)
length(top_hvgs)
## [1] 975
# Principal component analysis
sce <- denoisePCA(sce, dec, subset.row = top_hvgs, min.rank = 5, max.rank = 50)
ncol(reducedDim(sce, "PCA"))
## [1] 5
# Dimensionality reduction by UMAP
sce <- runUMAP(sce, dimred = "PCA")

UMAP plots

dittoDimPlot(sce, "cell_type", reduction.use = "UMAP", size = 0.5,
             legend.show = FALSE, do.label = TRUE,
             labels.highlight = FALSE, labels.size = 4)

dittoDimPlot(sce, "cell_type", reduction.use = "UMAP", size = 0.1,
             split.by = "cell_type", split.ncol = 2, legend.show = FALSE)

Preparing the anndata object

sce_adata <- SingleCellExperiment(
  assays = list(
    counts = assay(sce, "spliced"),
    spliced = assay(sce, "spliced"),
    unspliced = assay(sce, "unspliced")
  ),
  colData = DataFrame(
    cell_type = sce$cell_type
  ),
  reducedDims = list(
    X_umap = reducedDim(sce, "UMAP")
  )
)
sce
## class: SingleCellExperiment 
## dim: 54448 2325 
## metadata(0):
## assays(3): spliced unspliced logcounts
## rownames(54448): ENSMUSG00000102693 ENSMUSG00000064842 ...
##   ENSMUSG00000064369 ENSMUSG00000064372
## rowData names(0):
## colnames(2325): CCCATACTCCGAAGAG AATCCAGTCATCTGCC ... ATCCACCCACCACCAG
##   ATTGGTGGTTACCGAT
## colData names(3): celltype cell_type sizeFactor
## reducedDimNames(2): PCA UMAP
## mainExpName: NULL
## altExpNames(0):
adata <- SCE2AnnData(sce_adata)
adata
## AnnData object with n_obs × n_vars = 2325 × 54448
##     obs: 'cell_type'
##     uns: 'X_name'
##     obsm: 'X_umap'
##     layers: 'spliced', 'unspliced'

RNA velocity analysis using scVelo

Preparing the data

Load the required Python modules:

import scanpy as sc
import scvelo as scv
import cellrank as cr
import matplotlib.pyplot as plt
scv.settings.verbosity = 0
cr.settings.verbosity = 0

Get the anndata object:

adata = r.adata

Pie chart of spliced/unspliced proportions:

scv.pl.proportions(adata, groupby='cell_type')

Preprocess the data:

scv.pp.filter_and_normalize(adata, n_top_genes=2000, enforce=True)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

RNA velocity

RNA velocity analysis using the dynamical model:

scv.tl.recover_dynamics(adata, n_jobs = 4)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)

Stream plot of velocities on the UMAP embedding:

scv.pl.velocity_embedding_stream(adata, basis='umap', color='cell_type')

Speed and coherence

scv.tl.velocity_confidence(adata)
scv.pl.scatter(adata, basis='umap', color='velocity_length',
               cmap='coolwarm', perc=[5, 95])

scv.pl.scatter(adata, basis='umap', color='velocity_confidence',
               cmap='coolwarm', perc=[5, 95])

Velocity graph and pseudotime

scv.tl.velocity_pseudotime(adata)
scv.pl.velocity_graph(adata, basis='umap', color='cell_type', threshold=0.1,
                      legend_loc='best')

scv.pl.scatter(adata, basis='umap', color='velocity_pseudotime',
               cmap='gnuplot')

PAGA velocity graph

scv.tl.paga(adata, groups='cell_type')
scv.pl.paga(adata, basis='umap', size=50, alpha=0.1, min_edge_width=2,
            node_size_scale=1.5, legend_loc='best')

Differential velocity analysis

scv.tl.rank_velocity_genes(adata, groupby='cell_type', min_corr=0.3)
diff_velo_df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
head(py$diff_velo_df)
Early Round Mid Round Late Round Other
ENSMUSG00000054910 ENSMUSG00000096405 ENSMUSG00000087524 ENSMUSG00000031770
ENSMUSG00000028701 ENSMUSG00000086732 ENSMUSG00000040446 ENSMUSG00000061607
ENSMUSG00000028248 ENSMUSG00000109981 ENSMUSG00000024500 ENSMUSG00000028132
ENSMUSG00000110611 ENSMUSG00000097325 ENSMUSG00000099684 ENSMUSG00000020328
ENSMUSG00000097392 ENSMUSG00000034040 ENSMUSG00000104111 ENSMUSG00000022462
ENSMUSG00000100684 ENSMUSG00000030276 ENSMUSG00000110629 ENSMUSG00000026743
scv.pl.velocity(adata, diff_velo_df.iloc[0, :], color='cell_type', ncols = 1)

Inspecting the anndata object

adata
## AnnData object with n_obs × n_vars = 2325 × 2000
##     obs: 'cell_type', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts', 'velocity_self_transition', 'velocity_length', 'velocity_confidence', 'velocity_confidence_transition', 'root_cells', 'end_points', 'velocity_pseudotime'
##     var: 'gene_count_corr', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'fit_r2', 'fit_alpha', 'fit_beta', 'fit_gamma', 'fit_t_', 'fit_scaling', 'fit_std_u', 'fit_std_s', 'fit_likelihood', 'fit_u0', 'fit_s0', 'fit_pval_steady', 'fit_steady_u', 'fit_steady_s', 'fit_variance', 'fit_alignment_scaling', 'velocity_genes', 'spearmans_score', 'velocity_score'
##     uns: 'X_name', 'pca', 'neighbors', 'recover_dynamics', 'velocity_params', 'velocity_graph', 'velocity_graph_neg', 'cell_type_colors', 'paga', 'cell_type_sizes', 'rank_velocity_genes'
##     obsm: 'X_umap', 'X_pca', 'velocity_umap'
##     varm: 'PCs', 'loss'
##     layers: 'spliced', 'unspliced', 'Ms', 'Mu', 'fit_t', 'fit_tau', 'fit_tau_', 'velocity', 'velocity_u'
##     obsp: 'distances', 'connectivities'

Cell fate mapping using Cellrank

Data processing using VelocityKernel

Calculating the transition matrix:

vtk = cr.tl.kernels.VelocityKernel(adata)
vtk.compute_transition_matrix()

Plot the UMAP projection of the transition matrix:

vtk.compute_projection(basis='umap')
scv.pl.velocity_embedding_stream(adata, basis='umap', color='cell_type',
                                 vkey='T_fwd')

Macrostate analysis

Computing Schur decomposition:

gpcaa = cr.tl.estimators.GPCCA(vtk)
gpcaa.compute_schur(n_components=10)

Top eigenvalues plot:

gpcaa.plot_spectrum(real_only=True)
plt.show()

Computing macrostates:

gpcaa.compute_macrostates(n_states=2, cluster_key='cell_type')
gpcaa.plot_macrostates()

Computing terminal states:

gpcaa.compute_terminal_states()
gpcaa.plot_terminal_states()

Absorption probabilities

Calculating absorption probabilities:

gpcaa.compute_absorption_probabilities()

Plot the absorption probabilities:

gpcaa.plot_absorption_probabilities(legend_loc='best')

gpcaa.plot_absorption_probabilities(same_plot=False, ncols=1)

Inspecting the anndata object

adata
## AnnData object with n_obs × n_vars = 2325 × 2000
##     obs: 'cell_type', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts', 'velocity_self_transition', 'velocity_length', 'velocity_confidence', 'velocity_confidence_transition', 'root_cells', 'end_points', 'velocity_pseudotime', 'clusters_gradients', 'terminal_states', 'terminal_states_probs'
##     var: 'gene_count_corr', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'fit_r2', 'fit_alpha', 'fit_beta', 'fit_gamma', 'fit_t_', 'fit_scaling', 'fit_std_u', 'fit_std_s', 'fit_likelihood', 'fit_u0', 'fit_s0', 'fit_pval_steady', 'fit_steady_u', 'fit_steady_s', 'fit_variance', 'fit_alignment_scaling', 'velocity_genes', 'spearmans_score', 'velocity_score'
##     uns: 'X_name', 'pca', 'neighbors', 'recover_dynamics', 'velocity_params', 'velocity_graph', 'velocity_graph_neg', 'cell_type_colors', 'paga', 'cell_type_sizes', 'rank_velocity_genes', 'T_fwd_params', 'schur_matrix_fwd', 'eigendecomposition_fwd', 'coarse_fwd', 'clusters_gradients_colors', 'terminal_states_colors'
##     obsm: 'X_umap', 'X_pca', 'velocity_umap', 'T_fwd_umap', 'schur_vectors_fwd', 'terminal_states_memberships', 'to_terminal_states'
##     varm: 'PCs', 'loss'
##     layers: 'spliced', 'unspliced', 'Ms', 'Mu', 'fit_t', 'fit_tau', 'fit_tau_', 'velocity', 'velocity_u'
##     obsp: 'distances', 'connectivities'

Saving the anndata object

adata.uns['terminal_states_names'] = adata.obsm['to_terminal_states'].names
del adata.uns['coarse_fwd']
adata.write('rna_velocity_scvelo_hermann_2018.h5ad', compression='gzip')

Getting back to R

Read the h5ad file:

sce_cellrank <- readH5AD("rna_velocity_scvelo_hermann_2018.h5ad")
sce_cellrank
## class: SingleCellExperiment 
## dim: 2000 2325 
## metadata(15): T_fwd_params cell_type_colors ... velocity_graph_neg
##   velocity_params
## assays(10): counts Ms ... velocity velocity_u
## rownames(2000): ENSMUSG00000025903 ENSMUSG00000097171 ...
##   ENSMUSG00000064367 ENSMUSG00000064370
## rowData names(25): gene_count_corr means ... velocity_score varm
## colnames(2325): CCCATACTCCGAAGAG AATCCAGTCATCTGCC ... ATCCACCCACCACCAG
##   ATTGGTGGTTACCGAT
## colData names(15): cell_type initial_size_spliced ... terminal_states
##   terminal_states_probs
## reducedDimNames(7): T_fwd_umap X_pca ... to_terminal_states
##   velocity_umap
## mainExpName: NULL
## altExpNames(0):

Get terminal state absorption probabilities for each cell:

absorption_probabilities <- setNames(
  as.data.frame(reducedDim(sce_cellrank, "to_terminal_states")),
  metadata(sce_cellrank)$terminal_states_names
)
head(absorption_probabilities)
Late Round Other
CCCATACTCCGAAGAG 0.7484622 0.2515375
AATCCAGTCATCTGCC 0.7509432 0.2490566
GACTGCGGTTTGACTG 0.7514131 0.2485865
ACACCAATCTTCGGTC 0.7471087 0.2528910
TGACAACAGGACAGAA 0.7650655 0.2349341
TTGGAACAGGCGTACA 0.7670970 0.2329027

Session Info

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] zellkonverter_1.2.1         reticulate_1.22            
##  [3] dittoSeq_1.4.3              scran_1.20.1               
##  [5] scater_1.20.1               scuttle_1.2.1              
##  [7] scRNAseq_2.6.1              SingleCellExperiment_1.14.1
##  [9] SummarizedExperiment_1.22.0 Biobase_2.52.0             
## [11] GenomicRanges_1.44.0        GenomeInfoDb_1.28.4        
## [13] IRanges_2.26.0              S4Vectors_0.30.2           
## [15] BiocGenerics_0.38.0         MatrixGenerics_1.4.3       
## [17] matrixStats_0.61.0          Matrix_1.3-4               
## [19] knitr_1.36                  glue_1.4.2                 
## [21] forcats_0.5.1               stringr_1.4.0              
## [23] dplyr_1.0.7                 purrr_0.3.4                
## [25] readr_2.0.2                 tidyr_1.1.4                
## [27] tibble_3.1.5                ggplot2_3.3.5              
## [29] tidyverse_1.3.1            
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                    tidyselect_1.1.1             
##   [3] RSQLite_2.2.8                 AnnotationDbi_1.54.1         
##   [5] grid_4.1.0                    BiocParallel_1.26.2          
##   [7] munsell_0.5.0                 ScaledMatrix_1.0.0           
##   [9] statmod_1.4.36                withr_2.4.2                  
##  [11] colorspace_2.0-2              filelock_1.0.2               
##  [13] highr_0.9                     rstudioapi_0.13              
##  [15] labeling_0.4.2                GenomeInfoDbData_1.2.6       
##  [17] bit64_4.0.5                   farver_2.1.0                 
##  [19] pheatmap_1.0.12               rprojroot_2.0.2              
##  [21] basilisk_1.4.0                vctrs_0.3.8                  
##  [23] generics_0.1.0                xfun_0.26                    
##  [25] BiocFileCache_2.0.0           R6_2.5.1                     
##  [27] ggbeeswarm_0.6.0              rsvd_1.0.5                   
##  [29] locfit_1.5-9.4                AnnotationFilter_1.16.0      
##  [31] bitops_1.0-7                  cachem_1.0.6                 
##  [33] DelayedArray_0.18.0           assertthat_0.2.1             
##  [35] promises_1.2.0.1              BiocIO_1.2.0                 
##  [37] scales_1.1.1                  beeswarm_0.4.0               
##  [39] gtable_0.3.0                  beachmat_2.8.1               
##  [41] ensembldb_2.16.4              rlang_0.4.11                 
##  [43] rtracklayer_1.52.1            lazyeval_0.2.2               
##  [45] broom_0.7.9                   BiocManager_1.30.16          
##  [47] yaml_2.2.1                    modelr_0.1.8                 
##  [49] GenomicFeatures_1.44.2        backports_1.2.1              
##  [51] httpuv_1.6.3                  tools_4.1.0                  
##  [53] ellipsis_0.3.2                jquerylib_0.1.4              
##  [55] RColorBrewer_1.1-2            ggridges_0.5.3               
##  [57] Rcpp_1.0.7                    plyr_1.8.6                   
##  [59] sparseMatrixStats_1.4.2       progress_1.2.2               
##  [61] zlibbioc_1.38.0               RCurl_1.98-1.5               
##  [63] basilisk.utils_1.4.0          prettyunits_1.1.1            
##  [65] viridis_0.6.1                 cowplot_1.1.1                
##  [67] haven_2.4.3                   ggrepel_0.9.1                
##  [69] cluster_2.1.2                 fs_1.5.0                     
##  [71] here_1.0.1                    magrittr_2.0.1               
##  [73] reprex_2.0.1                  ProtGenerics_1.24.0          
##  [75] hms_1.1.1                     mime_0.12                    
##  [77] evaluate_0.14                 xtable_1.8-4                 
##  [79] XML_3.99-0.8                  readxl_1.3.1                 
##  [81] gridExtra_2.3                 compiler_4.1.0               
##  [83] biomaRt_2.48.3                crayon_1.4.1                 
##  [85] htmltools_0.5.2               later_1.3.0                  
##  [87] tzdb_0.1.2                    lubridate_1.8.0              
##  [89] DBI_1.1.1                     ExperimentHub_2.0.0          
##  [91] dbplyr_2.1.1                  rappdirs_0.3.3               
##  [93] cli_3.0.1                     metapod_1.0.0                
##  [95] igraph_1.2.6                  pkgconfig_2.0.3              
##  [97] GenomicAlignments_1.28.0      dir.expiry_1.0.0             
##  [99] xml2_1.3.2                    vipor_0.4.5                  
## [101] bslib_0.3.1                   dqrng_0.3.0                  
## [103] XVector_0.32.0                rvest_1.0.1                  
## [105] digest_0.6.28                 Biostrings_2.60.2            
## [107] rmarkdown_2.11                cellranger_1.1.0             
## [109] edgeR_3.34.1                  DelayedMatrixStats_1.14.3    
## [111] restfulr_0.0.13               curl_4.3.2                   
## [113] shiny_1.7.1                   Rsamtools_2.8.0              
## [115] rjson_0.2.20                  lifecycle_1.0.1              
## [117] jsonlite_1.7.2                BiocNeighbors_1.10.0         
## [119] viridisLite_0.4.0             limma_3.48.3                 
## [121] fansi_0.5.0                   pillar_1.6.3                 
## [123] lattice_0.20-44               KEGGREST_1.32.0              
## [125] fastmap_1.1.0                 httr_1.4.2                   
## [127] interactiveDisplayBase_1.30.0 png_0.1-7                    
## [129] bluster_1.2.1                 BiocVersion_3.13.1           
## [131] bit_4.0.4                     stringi_1.7.5                
## [133] sass_0.4.0                    blob_1.2.2                   
## [135] BiocSingular_1.8.1            AnnotationHub_3.0.1          
## [137] memoise_2.0.0                 irlba_2.3.3